Amiga Format CD 42

home *** CD-ROM | disk | FTP | other *** search

/ Amiga Format CD 42 / Amiga Format AFCD42 (Issue 126, Aug 1999).iso / -serious- / programming / c / awin / cpu5azure2.asm < prev next >

Wrap

Assembly Source File | 1999-05-17 | 10KB | 508 lines

; FILE: GG:src/own/awin/cpu5azure2.ASM          REV: 10 --- fast 040/060 c2p by Azure
; LINK: >LEAVEOBJ>
; History
; 0     This is supposed to be cleaner :)
; 7     Made it dynamic.
; 10    Made it mostly(!) dynamic about plane count,
;       you still *MUST* pass all 8 planes!!
;
;---------------------------------------------------------------------------
; 5 Pass CPU Chunky to Planar Converter for 68040/60
;
; This c2p is enhanced for slow 060-boards like the Blizzard 1260 and
; Apollo 1260.
;
; Tested: Copyspeed on Apollo 1240/40, Apollo 4040/40, Apollo 1260/50.
;         It is probably copyspeed on all 4060 boards, all 1240/40 boards
;         and all 4040/40 boards. On slower 040 boards it should perform
;         well, too. I hope it is copyspeed on Blizzard 1260/50, too.
;
; (W) and (C) 6-7.5.1997 by Tim Boescke
;             Azure/Artwork
;
;
; This converter is using enhanced and paired mergeops (rot-merges) taking
; 3 cycles per merge on 060, 8 cycles per merge on 040, plus a little
; overhead for the final rot-correction. The disadvantage is that the 16bit
; merge is slightly slower now on 040.
;
; Effective cycles taken for 8lw c2p:
;
;                        040     060
;
; rot paired merge       169     61.5
; normal paired merge    168     68
; non paired merge       168     ~132-152

; Entry points exported to the linker.
	XDEF	_awinitchunky2planar
	XDEF	_awchunky2planar

;void awinitchunky2planar(UBYTE *chunky __asm("a0"),
;                         ULONG width __asm("d0"),
;                         ULONG height __asm("d1"),
;                         ULONG depth __asm("d2"));
;void awchunky2planar(UBYTE *planar __asm("a1"));

; Assembly-time switch tested with IFNE/IFEQ further down.
USEA7	equ	0	;1 = use a7, 0 = don't use a7
			;NoA7 uses self-modifying code with cacheflush.
			;It is applied every time you change the chunky
			;buffer's location (a0=source), so changing it a
			;lot of times could slow the c2p down a bit.
;---------------------------------------------------------------------------
; _awinitchunky2planar -- set up the self-modifying c2p routine
;
;IN:
;
;	a0	=source  (chunky buffer)
;	d0	=width
;	d1	=height
;	d2	=depth   (values above 8 are treated as 8)
;
;OUT:	nothing; d0-d7/a0-a6 are restored before returning
;
; Steps:
;  1. First call only (.alignbase is still 0): copy the code between
;     _alignhere and _loopend down to the 16-byte boundary at or below
;     _alignhere, remember that address in .alignbase, and save the opcode
;     words found at _smc_p0.._smc_p7 of the copy in .cwtab.
;  2. Patch the source pointer, the end-of-source pointer and the
;     plane-size multiples into the immediates of the _smc_* instructions.
;  3. Put the eight saved opcode words back, then overwrite the ones for
;     planes >= depth with $2048 (move.l a0,a0) so those writes do nothing.
;  4. Flush the caches, because code has been modified.
;
; Everything inside the relocated area must be addressed relative to
; .alignbase (held in a5); the other patch targets use their assembled
; addresses.
;
;----------------------------------------------------------------------------

		CNOP	0,8
_awinitchunky2planar:
		movem.l	d0-d7/a0-a6,-(a7)	; preserve all registers for the caller
		movem.l	d0-d2/a0-a1,-(a7)	; keep the arguments across the realign step

		move.l	.alignbase(pc),d0	; non-zero once the loop has been relocated
		bne.b	.norealign

	; Align the c2p to a 16 byte-border

		move.l	#_alignhere,d0
		and.w	#$fff0,d0		; round the address down to a multiple of 16
		move.l	d0,a0
		move.l	a0,.alignbase
		lea	_alignhere(pc),a1
		move.w	#_loopend-_alignhere,d0
		lsr.w	#1,d0			;phxass is drain bamaged
						; (byte length -> word count, done at run time)
.reloop		move.w	(a1)+,(a0)+		; destination <= source, so the ascending
		subq.w	#1,d0			; copy is safe although the areas overlap
		bne.b	.reloop

	; Get original chip writes

		lea	.cwtab(pc),a0
		move.l	.alignbase(pc),a1
		move.w	(_smc_p0-_alignhere,a1),(a0)+
		move.w	(_smc_p1-_alignhere,a1),(a0)+
		move.w	(_smc_p2-_alignhere,a1),(a0)+
		move.w	(_smc_p3-_alignhere,a1),(a0)+
		move.w	(_smc_p4-_alignhere,a1),(a0)+
		move.w	(_smc_p5-_alignhere,a1),(a0)+
		move.w	(_smc_p6-_alignhere,a1),(a0)+
		move.w	(_smc_p7-_alignhere,a1),(a0)+

.norealign
		movem.l	(a7)+,d0-d2/a0-a1	; arguments back

	; width and height are ULONGs, so multiply all 32 bits. The former
	; mulu.w silently used only the low words. mulu.l is a 68020+ form;
	; this routine already relies on 020+ addressing modes (see the
	; scaled index at the jmp below).
		mulu.l	d1,d0			;d0=screensize

		move.l	d0,d1
		lsr.l	#3,d1			;d1=plane (screensize/8)

		move.l	a0,_smcsrca0+2		; +2 skips the opcode word: patch the immediate
		lea	(a0,d0.l),a0		; a0 = source + screensize (end pointer)

		IFNE	USEA7
		move.l	a0,_smcherea7+2
		ENDC

	; must use .alignbase for aligned area!!
	; (ie. for things between _loop and _loopend)

		move.l	.alignbase(pc),a5

		IFEQ	USEA7
		move.l	a0,(_endsmc+2-_alignhere,a5)
		ENDC

	;d1=plane (screensize/8)
	; NOTE(review): _smc_plane04/05/06, _smc_2plane02 and _smc_5plane401 are
	; patched at their assembled addresses; presumably _smc_plane04/05/06
	; and _smc_2plane02 sit outside the relocated area -- they are not
	; visible from here, confirm.

		move.l	d1,(_smc_plane01+2-_alignhere,a5)
		move.l	d1,(_smc_plane02+2-_alignhere,a5)
		move.l	d1,(_smc_plane03+2-_alignhere,a5)
		move.l	d1,_smc_plane04+2
		move.l	d1,_smc_plane05+2
		move.l	d1,_smc_plane06+2

		move.l	d1,d3
		add.l	d1,d3			;d3=2*plane
		move.l	d3,(_smc_2plane01+2-_alignhere,a5)
		move.l	d3,_smc_2plane02+2

		move.l	d3,d4
		addq.l	#4,d4			;d4=2*plane+4
		move.l	d4,(_smc_2plane401+2-_alignhere,a5)

		move.l	d3,d4
		add.l	d4,d4			;d4=4*plane
		move.l	d4,(_smc_4plane01+2-_alignhere,a5)

		add.l	d1,d4
		subq.l	#4,d4			;d4=5*plane-4
		move.l	d4,_smc_5plane401+2

		move.l	d1,d3
		add.l	d1,d3
		add.l	d1,d3			;d3=3*plane
		move.l	d3,(_smc_3plane01+2-_alignhere,a5)

		move.l	d3,d4
		add.l	d4,d4			;d4=6*plane
		move.l	d4,(_smc_6plane01+2-_alignhere,a5)

	; Restore all eight plane writes from the saved opcode words, so that a
	; previous call with a smaller depth does not leave planes disabled.

		lea	.cwtab(pc),a0
		move.w	(a0)+,(_smc_p0-_alignhere,a5)
		move.w	(a0)+,(_smc_p1-_alignhere,a5)
		move.w	(a0)+,(_smc_p2-_alignhere,a5)
		move.w	(a0)+,(_smc_p3-_alignhere,a5)
		move.w	(a0)+,(_smc_p4-_alignhere,a5)
		move.w	(a0)+,(_smc_p5-_alignhere,a5)
		move.w	(a0)+,(_smc_p6-_alignhere,a5)
		move.w	(a0)+,(_smc_p7-_alignhere,a5)

	; Disable the writes of the unused planes. The jmp skips the first
	; <depth> entries of the table below, so only planes depth..7 get the
	; no-op opcode. This depends on every table entry assembling to exactly
	; 4 bytes (opcode word + 16-bit displacement) -- do not let the
	; assembler optimise these instructions.

		cmp.l	#8,d2
		bls.b	.dok			; unsigned compare: depth <= 8 is fine
		moveq	#8,d2			; clamp larger values to 8
.dok
		move.w	#$2048,d0		;move.l a0,a0
		jmp	.djpos(pc,d2.l*4)
.djpos
		move.w	d0,(_smc_p0-_alignhere,a5)
		move.w	d0,(_smc_p1-_alignhere,a5)
		move.w	d0,(_smc_p2-_alignhere,a5)
		move.w	d0,(_smc_p3-_alignhere,a5)
		move.w	d0,(_smc_p4-_alignhere,a5)
		move.w	d0,(_smc_p5-_alignhere,a5)
		move.w	d0,(_smc_p6-_alignhere,a5)
		move.w	d0,(_smc_p7-_alignhere,a5)

		move.l	4.w,a6			; exec library base
		jsr	-636(a6)		;cacheflush (LVO -636 is exec's CacheClearU)

		movem.l	(a7)+,d0-d7/a0-a6
		rts

.alignbase	dc.l	0			; address of the relocated loop, 0 = not relocated yet
.cwtab		dc.w	0,0,0,0			; saved opcode words of _smc_p0.._smc_p7
		dc.w	0,0,0,0

;
;IN:
;
;	(a0	=source)
;	a1	=target
;
;NOTE!!!! : Dont use any optimizations when assembling this! Especially
;           not with PHXass. The generated code might not work otherwise.
;--------------------------------------------------------------------------- CNOP 0,8 _awchunky2planar: movem.l d0-d7/a2-a6,-(a7) _smcsrca0 move.l #$1337C0DE,a0 IFNE USEA7 move.l a7,_a7save _smcherea7 move.l #$BADC0DE,a7 ;a7=endpointer ENDC _smc_5plane401 add.l #$C0DE7,a1 ;5*.plane-4,a1 move.l (a0)+,d0 move.l (a0)+,d1 move.l (a0)+,d2 move.l (a0)+,d3 move.l (a0)+,d4 move.l (a0)+,a3 move.l (a0)+,d6 move.l (a0)+,a2 swap d4 swap d6 eor.w d0,d4 eor.w d2,d6 eor.w d4,d0 eor.w d6,d2 eor.w d0,d4 eor.w d2,d6 ror.l #8,d2 rol.l #8,d4 move.l d6,d7 move.l d2,d5 eor.l d4,d7 eor.l d0,d5 and.l #$00FF00FF,d5 and.l #$FF00FF00,d7 eor.l d5,d0 eor.l d7,d4 eor.l d5,d2 eor.l d7,d6 rol.l #6,d4 rol.l #6,d6 move.l d4,d5 move.l d6,d7 eor.l d0,d5 eor.l d2,d7 and.l #$33333333,d5 and.l #$33333333,d7 eor.l d5,d0 eor.l d7,d2 eor.l d5,d4 eor.l d7,d6 rol.l #4,d2 rol.l #4,d6 move.l a2,d7 move.l a3,d5 move.l d6,a2 move.l d4,a3 swap d5 swap d7 eor.w d1,d5 eor.w d3,d7 eor.w d5,d1 eor.w d7,d3 eor.w d1,d5 eor.w d3,d7 ror.l #8,d3 rol.l #8,d5 move.l d7,d6 move.l d3,d4 eor.l d5,d6 eor.l d1,d4 and.l #$00FF00FF,d4 and.l #$FF00FF00,d6 eor.l d4,d1 eor.l d6,d5 eor.l d4,d3 eor.l d6,d7 rol.l #6,d5 rol.l #6,d7 move.l d5,d4 move.l d7,d6 eor.l d1,d4 eor.l d3,d6 and.l #$33333333,d4 and.l #$33333333,d6 eor.l d4,d1 eor.l d6,d3 eor.l d4,d5 eor.l d6,d7 ror.l #4,d1 ror.l #4,d5 move.l a2,d6 move.l d5,a2 REPT 4 ;space for realigning move.l a1,a1 ;pipelined/superscalar nop move.l a2,a2 ;(Note: the real NOP is more than a ; No-Operation. 
It does Pipeline-Sync and ; is dead slow that way) ;asm-one isnt assembling trapf ENDR _alignhere bra.w _enter_here REPT 3 move.l a1,a1 move.l a2,a2 ENDR _loop move.l (a0)+,d0 move.l (a0)+,d1 move.l (a0)+,d2 move.l (a0)+,d3 move.l (a0)+,d4 move.l (a0)+,a3 move.l (a0)+,d6 move.l (a0)+,a2 _smc_p0 move.l d7,(a1) ;plane0 swap d4 swap d6 eor.w d0,d4 eor.w d2,d6 eor.w d4,d0 eor.w d6,d2 eor.w d0,d4 eor.w d2,d6 _smc_plane01 add.l #$C0DE1,a1 ror.l #8,d2 rol.l #8,d4 _smc_p1 move.l d5,(a1) ;plane1 move.l d6,d7 move.l d2,d5 eor.l d4,d7 eor.l d0,d5 and.l #$00FF00FF,d5 and.l #$FF00FF00,d7 eor.l d5,d0 eor.l d7,d4 eor.l d5,d2 eor.l d7,d6 rol.l #6,d4 rol.l #6,d6 _smc_plane02 add.l #$C0DE1,a1 move.l d4,d5 move.l d6,d7 eor.l d0,d5 eor.l d2,d7 and.l #$33333333,d5 and.l #$33333333,d7 eor.l d5,d0 eor.l d7,d2 eor.l d5,d4 _smc_p2 move.l a4,(a1) ;plane2 eor.l d7,d6 rol.l #4,d2 rol.l #4,d6 move.l a2,d7 move.l a3,d5 move.l d6,a2 move.l d4,a3 swap d5 swap d7 eor.w d1,d5 eor.w d3,d7 eor.w d5,d1 eor.w d7,d3 eor.w d1,d5 eor.w d3,d7 _smc_2plane01 add.l #$C0DE2,a1 ror.l #8,d3 rol.l #8,d5 move.l d7,d6 move.l d3,d4 eor.l d5,d6 _smc_p4 move.l a5,(a1) ;plane4 eor.l d1,d4 and.l #$00FF00FF,d4 and.l #$FF00FF00,d6 eor.l d4,d1 eor.l d6,d5 eor.l d4,d3 eor.l d6,d7 rol.l #6,d5 rol.l #6,d7 move.l d5,d4 move.l d7,d6 eor.l d1,d4 eor.l d3,d6 _smc_plane03 add.l #$C0DE1,a1 and.l #$33333333,d4 and.l #$33333333,d6 eor.l d4,d1 eor.l d6,d3 eor.l d4,d5 eor.l d6,d7 ror.l #4,d1 ror.l #4,d5 move.l a2,d6 move.l d5,a2 _smc_p5 move.l a6,(a1) ;plane5 _ent